
clear all

*-------------------------------------------------
// STEP 1: Load data on households and individuals 
*-------------------------------------------------

use "${datain}househ_wp2" , clear

rename w17_1455_lnr indid
rename aargang year

keep indid /// person-id
	kvinne /// 1 if woman
	antpers_i_regstat_famnr /// number of individuals per hh (adults + children)
	year /// year
	familienr /// family-id
	age /// age
	grkrets /// household location
	utdkode /// education code - NUS2000 
	wies /// income net of taxes. NOK
	regtype1  /// region type (city, urban, etc.)     
	grputd /// aggregated education 
	sel_utdgrp /// selected education groups    
	employed /// 1 == Yrkesinntekt above 1G
	skilled   /// Based on either education or employment codes, not sure
	wealth   /// wealth, NOK
	hushtyp   /// household type
	secondhome /// owns a second home
	retired /// 1 == recieving age pension
	nr_adults /// number of adults in the household, not counting children above 18
	house /// 1 == owns their own house    
	children /// 1 == lives with children below 18
	adult /// 1 == above 18 and not living with parents 

*-------------------------------------------------
// STEP 2: Merge with data on workplace
*-------------------------------------------------	

destring year, replace

merge m:1 indid year using "${datain}work_grk_2005_2017", keep(match master)
drop _merge orgnrbed orgnrfor

*-------------------------------------------------
// STEP 3: Add info on time, distance, toll and ptl to grk-grk-pairs 
*-------------------------------------------------	

preserve
	use "${datain}distances", clear
	reshape long takst_r, i(grk1 grk2) j(year)
	rename takst_r toll
	rename weight time
	rename s dist
	rename kfelt ptl
	keep grk1 grk2 year toll time dist ptl
	tempfile grk1grk2
	save `grk1grk2'
restore

destring grkrets, gen(grk1)
destring grk_bed, gen(grk2)

/* to work */
merge m:1 grk1 grk2 year using `grk1grk2', keep(match master)
drop _merge
rename toll toll2
rename time time2
rename dist dist2
rename ptl ptl2

/* changing the order of grks */
rename grk2 temp
rename grk1 grk2
rename temp grk1

/* from work */
merge m:1 grk1 grk2 year using `grk1grk2', keep(match master)
drop _merge
rename toll toll1
rename time time1
rename dist dist1
rename ptl ptl1

drop grk1 grk2

/* Creating averages of to/from work, where available */
foreach var in time dist ptl toll {
	gen `var' = (`var'1 + `var'2) / 2
	replace `var' = `var'1 if `var' == .
	replace `var' = `var'2 if `var' == .
	drop `var'1 `var'2
}

replace dist = dist / 1000 /* Measure in km */
replace ptl = ptl   / 1000 /* Measure in km */
compress

*-------------------------------------------------
// STEP 4: Reshape to wide, to create hh level data
*-------------------------------------------------	

/* 
Define (up to) two "main" household members
This step is only relevant for households consisting of three or more adults. 
Selection criteria chosen such that we avoid selecting seniors living with
their children, and avoid selecting older children still living with their
parents.

Criteria:
	Member 1:
		- Male, if available
		- With work, if available
		- Oldest
	Member 2:
		- Female, if available
		- Otherwise, sorted the same way as member 1
*/

// How many observations do we have per family? 
bys familienr year: egen antall_i_data = count(kvinne) 

// Some households have no adults. In these cases everyone above 18 is defined as adult 
replace adult = 1 if nr_adults == 0 

// Drop all children above 18 in the same household. 
drop if adult == 0

//  Making sure that by family id-year-gender, the employed is selected first, then the oldest individual 
gsort familienr year kvinne -employed -age

// The idea here is to make a priority list of which hh members to include - include the ("man") and ("woman") with the lowest number 
bys familienr year kvinne: gen first_ind = _n 
gen second_ind = first_ind if kvinne == 1
replace first_ind = . if kvinne == 1

// Now, first_ind is a prioritized list of males, second_ind is a prioritized list of females, per hh-year */
drop antall_i_data
bys familienr year: egen antall_i_data = count(kvinne) /* how many? */
bys familienr year: egen antall_first = count(first_ind) /* how many males? */
bys familienr year: egen antall_second = count(second_ind) /* how many females? */

/*  Next steps are for dealing with same sex couples */
/* Making the second first_ind a second_ind in two-male households */
replace second_ind = 1 /// making a second_ind
	if antall_i_data >= 2 /// if there are >= 2 hh members ...
	& antall_second == 0 /// and none of them are women ...
	& antall_first == antall_i_data /// and # of hhmembers == # of first_ind
	& first_ind == 2 // choosing the "second best" first_ind to be the second_ind
replace first_ind = . if second_ind == 1 // and removing his first_ind status

/* Making the first second_ind a first_ind in two-female hhs */
replace first_ind = 1 ///
	if antall_i_data >= 2 ///
	& antall_first == 0 ///
	& antall_second == antall_i_data ///
	& second_ind == 1
replace second_ind = . if first_ind == 1	

/* Making the second second_ind the first second_ind in two-female hhs */
replace second_ind = 1 ///
	if antall_i_data >= 2 ///
	& antall_first == 0 ///
	& antall_second == antall_i_data ///
	& second_ind == 2

drop antall_first antall_second	

drop antall_i_data
gen index = . /* Creating variable for reshaping */
replace index = 1 if first_ind == 1
replace index = 2 if second_ind == 1

// Dropping household members that are neither first_ind ("man") nor second_ind ("woman") 
drop if index == . 

bys year familienr: egen aux = count(index)
gen couple = (aux == 2) /* "Couple" here defined as "at least" two adults in the household */
drop aux

drop hushtyp adult nr_adults first_ind second_ind

/* Fixing cases where household level variables do not match within each couple */
bys familienr year: egen aux = max(children)
replace children = aux
drop aux

bys familienr year: egen aux = max(antpers_i_regstat_famnr)
replace antpers_i_regstat_famnr = aux
drop aux

bys familienr year (index): replace grkrets = grkrets[_n-1] if index == 2 & couple == 1

drop regtype1  //  --> attach later 
drop sel_utdgrp house

*-------------
// Reshape
*-------------
reshape wide indid kvinne age wies ///
	employed wealth retired grk_bed time dist ptl toll ///
	utdkode grputd  skilled ///
	, i(familienr year) j(index)
		
/* 
NOTE: Regarding the female ("kvinne") variable - only variation comes from same sex couples:
	- kvinne1 == 1 iff both (all) are female
	- kvinne2 == 0 iff both (all) are male
*/	

/*  Setting travel to work variables = 0 if
	A: working & living in same grk, or
	B: person is not employed. 
	
	Otherwise, keep it as missing */

forvalues i = 1/2 {
	foreach var in time dist ptl toll {
		replace `var'`i' = 0 if indid`i' != "" ///
			& `var'`i' == . ///
			& grkrets == grk_bed`i'
		replace `var'`i' = 0 if indid`i' != "" ///
			& `var'`i' == . ///
			& employed`i' == 0
	}
}

order indid2 couple children antpers_i_regstat_famnr secondhome grkrets ///
	grk_bed1 grk_bed2 time1 time2 dist1 dist2 toll1 toll2 ptl1 ptl2 ///
	age1 age2 kvinne1 kvinne2 employed1 employed2 ///
	retired1 retired2 wies1 wies2 wealth1 wealth2, a(indid1)
	
*-------------------------------------------------
// STEP 5: Adding car data to the household data
*-------------------------------------------------	

/* 5A: retrieve household-individual link from base data */
preserve
	use w17_1455_lnr familienr aargang using "${datain}househ_select", clear
	destring aargang, gen(year)
	drop aargang
	rename w17_1455_lnr ownerid
	tempfile ind_link_to_hh
	save `ind_link_to_hh'
	
/* 5B: merge household id to individual id variable in car dataset */
	use "${datain}car_stock_end_of_year", clear
	compress
	// Drop cars owned by firms
	drop if ownertype == 0
	drop ownertype

	// We use a sample of car ownership status at 31.12 each year.
	// Therefore, only keep cars that are part of the fleet at the end of the year
	drop if car_dis == 1 // scrapped cars
	drop if car_dis == 2 // exported cars
	drop car_dis

	// Only keep valid cars (no motorcycles, taxis, hearses, scooters, etc.)
	drop if car_valid == 0
	// Keep relevant car variables  
	keep year ownerid carid car_acq kmperday days daysused freg_DMY fuel
	
	//  Merge with family id
	merge m:1 ownerid year using `ind_link_to_hh', keep(match master)
	
	compress
	tab year _merge
	drop _merge ownerid
	drop if familienr == "" /* non-matches */
	order familienr carid, a(year)
	sort familienr year carid
	rename days daysowned
	drop if daysused == 0
	drop if daysowned == 0	
	
	/* Variable used for merging cars to households in prioritized order 
	   freg_DMY is first registration date. I.e. sorting after car age */
	bys year familienr (freg_DMY): gen fam_car_nr = _N - _n + 1 ///
		if familienr != "" & daysused != 0
	compress
	
	/* Creating `cars' number of datasets to merge on the household data */
	local cars 3 /* How many cars to keep data on */
	scalar kmperday_min = 0 /* Minimum annual VKT allowed before it couts as missing */
	scalar kmperday_max = 200 /* Maximum annual VKT allowed before it couts as missing */
	
	* Fixing invalid VKT observations
	replace kmperday = . if kmperday < kmperday_min
	replace kmperday = . if kmperday > kmperday_max
	/* NOTE: kmperday defined as VKT split equally between days in which the car is registered, "daysused" */
	drop daysused daysowned

	tempfile cardata_w_familyid
	save `cardata_w_familyid'
restore

/* 5C: merging on cars in `cars' number of steps */
forvalues i = 1/`cars' {
	di `i'
	gen fam_car_nr = `i'
	merge 1:1 familienr year fam_car_nr ///
		using `cardata_w_familyid' ///
		, keep(match master)
	drop _merge fam_car_nr
	foreach var in carid car_acq kmperday /*daysowned daysused*/ freg_DMY fuel {
		rename `var' `var'`i'
	} 
}

compress

*-------------------------------------------
save "${datatemp}MainDataset", replace		
*-------------------------------------------



*-------------------------------------------------
// STEP 6: Make adjustments to dataset
*-------------------------------------------------	

//gen fylkenr= substr(grkrets,1,2)

*==== Drop variables I don't need to speed up the process
drop   car_acq1 car_acq2 car_acq3 ///
/* daysused1 daysused2  daysused3  daysowned1 daysowned2  daysowned3 */ ///
freg_DMY1 freg_DMY2  freg_DMY3  

*======== Drop distance outliers  ========
keep if dist1<=150 | dist1==.     /*Drop if distance to work is >150 km*/
keep if dist2<=150 | dist2==.     /*Drop if distance to work is >150 km*/

*======== Drop time outliers  ========
keep if time1<=120 | time1==.     /*Drop if time > 2 hours (120 minutes)*/
keep if time2<=120 | time2==.     /*Drop if time > 2 hours (120 minutes)*/

*==== Destring FE variables
destring grkrets, gen(grkrets_num)
destring grk_bed1, gen(grk_bed1_num)
destring grk_bed2, gen(grk_bed2_num)

*==== Show income in 1000 NOK
forvalues i=1(1)2 {
replace wies`i'=wies`i'/1000
replace wealth`i'=wealth`i'/1000
}

*==== Income and wealth deciles (by year)
forvalues i=1(1)2 {
egen wies`i'_decile = xtile(wies`i'),  by(year) nq(10)
egen wealth`i'_decile = xtile(wealth`i'),  by(year) nq(10)
}


*======== Generate HH-level variables  ========

*== BEV ownership  - Yes/No  (missing indicates no car)
gen BEV_fam_yes=0
replace BEV_fam_yes=1 if fuel1==5 | fuel2==5| fuel3==5 



// Number of BEV. (Assume missing indicates no car)
forvalues i=1/3 {
gen BEV`i'=0
replace BEV`i'=1 if fuel`i'==5
}
egen BEV_fam_count = rowtotal(BEV1 BEV2 BEV3) , missing
drop BEV1-BEV3
sum BEV_fam_count 


*==  Number of cars
forvalues i=1/3 {
gen car`i'=0
replace car`i'=1 if carid`i'!=""
}
egen car_fam_count = rowtotal(car1 car2 car3) , missing
drop car1-car3
sum car_fam_count


*== Number of ICE cars (incl. hybrid). non-BEV
forvalues i=1/3 {
gen car`i'=0
replace car`i'=1 if carid`i'!=""
replace car`i'=0 if fuel`i'==5	 /*BEV*/
replace car`i'=0 if fuel`i'==6   /*Hydrogen*/
}
egen ICE_fam_count = rowtotal(car1 car2 car3) , missing
drop car1-car3
sum ICE_fam_count car_fam_count


*== kmperday
egen km_fam_sum = rowtotal(kmperday1 kmperday2 kmperday3) , missing   /*Include missing option to ensure "." is not treated as 0*/
egen km_fam_mean = rowmean(kmperday1 kmperday2 kmperday3)


// Family has at least 1 car
gen car_fam_yes=0
replace car_fam_yes=1 if car_fam_count>=1
replace car_fam_yes=. if car_fam_count==.


// Declare as panel data
egen familienr_num=group(familienr)
xtset familienr_num year


/* Delete this?
*/
gen BEV_fam_yes_NEW=d.BEV_fam_yes
replace BEV_fam_yes_NEW=0 if BEV_fam_yes_NEW==-1
replace BEV_fam_yes_NEW=0 if BEV_fam_yes_NEW==. & BEV_fam_yes==0




*============================================================================
*======= Merge dataset with public transit information at grunnkrets level
*============================================================================

// From residence to work
preserve
use "${datain}kollektivtransport_grk" , clear
rename FRA grkrets_num /* from residence */
rename TIL grk_bed1_num /* to workplace 1 */
sort grkrets_num grk_bed1_num
tempfile public1
save `public1'
rename grk_bed1_num grk_bed2_num /* to workplace 2 */
sort grkrets_num grk_bed2_num
tempfile public2
save `public2'
restore


sort  grkrets_num grk_bed1_num
merge m:m grkrets_num grk_bed1_num using  `public1'
drop if _merge==2
drop _merge
local varlist CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST
foreach v in `varlist' {
rename `v' `v'_1 
}


sort  grkrets_num grk_bed2_num
merge m:m grkrets_num grk_bed2_num using  `public2'
drop if _merge==2
drop _merge
local varlist CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST
foreach v in `varlist' {
rename `v' `v'_2
}





// From work to area of residence
preserve
use "${datain}kollektivtransport_grk" , clear
rename TIL grkrets_num   /* to residence */
rename FRA grk_bed1_num  /* from workplace 1 */
sort grkrets_num grk_bed1_num
tempfile public1Rev
save `public1Rev'
rename grk_bed1_num grk_bed2_num /* from workplace 2 */
sort grkrets_num grk_bed2_num
tempfile public2Rev
save `public2Rev'
restore

sort  grkrets_num grk_bed1_num
merge m:m grkrets_num grk_bed1_num using  `public1Rev'
drop if _merge==2
drop _merge
local varlist CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST
foreach v in `varlist' {
replace  `v'_1 =`v' if `v'_1==.
}
drop `varlist'

sort  grkrets_num grk_bed2_num
merge m:m grkrets_num grk_bed2_num using  `public2Rev'
drop if _merge==2
drop _merge
local varlist CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST
foreach v in `varlist' {
replace  `v'_2 =`v' if `v'_2==.
}
drop `varlist'



*======== Make adjustments to public transit variables and generate new ones ================

// NB: If home and work is located in same "grunnkrets" --> Public travel distance is missing!
// --> Need to replace . by 0
local varlist CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST
foreach v in `varlist' {
forvalues i=1(1)2 {
replace `v'_`i' = 0 if `v'_`i'==. & grkrets_num==grk_bed`i'_num
}
}

*=== Drop observations where grunnkrets residence is missing
drop if grkrets_num==.  

// Generate one value per row  
egen employed = rowmax(employed1 employed2) 
egen grk_bed_num  = rowmax(grk_bed1_num  grk_bed2_num) 

*=== Keep observations with at least one employed adult in the HH
keep if employed==1    

*=== Drop single employed HH where grunnkrets work information is missing
drop if couple==0 & employed==1  & grk_bed_num==.  

*=== For employed adult in couple HH: drop if grk_bed_num is missing
drop if couple==1 & employed1==1  & grk_bed1_num==.  
drop if couple==1 & employed2==1  & grk_bed2_num==.  

*=== For all non-employed in couple HH: overwrite grunnkrets work to 9999
replace grk_bed1="9999"   if couple==1 & employed1==0  
replace grk_bed1_num=9999 if couple==1 & employed1==0  
*
replace grk_bed2="9999"   if couple==1 & employed2==0  
replace grk_bed2_num=9999 if couple==1 & employed2==0  

*=== For all non-employed in couple HH: overwrite all ind-work controls
forvalues i=1/2{
replace time`i'=0  if couple==1 & employed`i'==0  
replace dist`i'=0  if couple==1 & employed`i'==0 
replace toll`i'=0  if couple==1 & employed`i'==0 
replace ptl`i'=0   if couple==1 & employed`i'==0 
}

forvalues i=1/2{
replace CD_TID_`i'=0     if couple==1 & employed`i'==0  
replace TILBTID_`i'=0    if couple==1 & employed`i'==0 
replace OMBORDTID_`i'=0  if couple==1 & employed`i'==0 
replace VENTETID_`i'=0   if couple==1 & employed`i'==0 
replace BOARDINGS_`i'=0  if couple==1 & employed`i'==0 
replace TAKST_`i'=0      if couple==1 & employed`i'==0 
}

*=== Calculate household level data (after adjustments above)

*=== 1) Toll
//capt drop  toll_fam_mean toll_fam_sum toll_fam_yes toll_fam_max

egen toll_fam_mean = rowmean(toll1 toll2)
egen toll_fam_sum = rowtotal(toll1 toll2), missing  /*Include missing option to ensure "." is not treated as 0*/

gen toll_fam_yes=0
replace toll_fam_yes=1 if toll_fam_sum>0 & toll_fam_sum!=.
replace toll_fam_yes=. if toll_fam_sum==.

egen toll_fam_max = rowmax(toll1 toll2)
replace toll_fam_max=. if toll1==. & toll2==.


*=== 2) public transport lane
// capt drop ptl_fam_km_mean  ptl_fam_yes_mean  ptl_fam_yes_500m ptl1_pct ptl2_pct ptl_fam_pct ptl_fam_yes

egen ptl_fam_km_mean=rowmean(ptl1 ptl2)
egen ptl_fam_km_sum=rowtotal(ptl1 ptl2) , missing


*== Public lanes: yes/no (ptl: km med kollektivfelt)
forvalues i=1(1)2 {
gen ptl`i'_yes=0
replace ptl`i'_yes=1 if ptl`i'>0 
replace ptl`i'_yes=. if ptl`i'==.    
}
egen ptl_fam_sum=rowtotal(ptl1_yes ptl1_yes)  , missing
gen ptl_fam_yes=0
replace  ptl_fam_yes=1 if ptl_fam_sum>0 
replace ptl_fam_yes=. if ptl_fam_sum==.
drop ptl_fam_sum

*== Public lanes: mean of yes/no dummies within household (ptl: km med kollektivfelt)
egen ptl_fam_yes_mean=rowmean(ptl1_yes ptl2_yes)
drop ptl1_yes ptl2_yes

*== Public lanes: yes/no (at least 500 meters)
forvalues i=1(1)2 {
gen ptl`i'_yes_500m=0
replace ptl`i'_yes_500m=1 if ptl`i'>0.5 
replace ptl`i'_yes_500m=. if ptl`i'==.    
}
gen ptl_fam_yes_500m=0
replace  ptl_fam_yes_500m=1 if ptl1_yes_500m==1 |  ptl2_yes_500m==1
replace ptl_fam_yes_500m=. if  ptl1_yes_500m==. &  ptl2_yes_500m==.
drop ptl1_yes_500m ptl2_yes_500m

*== Public lanes: percent of total distance
forvalues i=1(1)2 {
gen ptl`i'_pct= ptl`i' / dist`i' if  dist`i'!=. & ptl`i'!=.
}
egen ptl_fam_pct=rowmean(ptl1_pct  ptl2_pct)

*==== public transit variables (remove VENTETID)

forvalues i=1(1)2 {
** 1) Total travel time
gen PublicTransitTime_`i' = TILBTID_`i' + OMBORDTID_`i'  /*+ VENTETID_`i'*/

** 2) Total travel time public/car
/*Change ratio from . to 1 if individual work and live in same grunnkrets*/
gen PublicVSCarTime_`i' = PublicTransitTime_`i' / CD_TID_`i'
replace PublicVSCarTime_`i' = 1 if PublicTransitTime_`i'==0 & CD_TID_`i'==0   

** 3) Total travel time public - total travel time car
gen PublicDiffCarTime_`i' = PublicTransitTime_`i' - CD_TID_`i'

** 4) Dummy variable (0/1) if access to Public transit 
gen PublicTransitYes_`i' = 0
replace PublicTransitYes_`i'=1 if PublicTransitTime_`i'!=.   
}

// Make average family values
egen PublicVSCarTime_fam_mean = rowmean(PublicVSCarTime_1 PublicVSCarTime_2) 
egen PublicDiffCarTime_fam_mean = rowmean(PublicDiffCarTime_1 PublicDiffCarTime_2) 

// Drop redundant variables
capt drop employed grk_bed_num
capt drop dup

//  Create inflation adjusted variables
sort year
merge m:1 year using "${datain}KPI" , nogen keep(1 3)

local varlist toll1 toll2 wies1 wies2 wealth1 wealth2 TAKST_1 TAKST_2 toll_fam_mean toll_fam_sum toll_fam_max
foreach v in `varlist' {
gen `v'_KPI = `v'/KPI_2017 * 100
}

// Only keep KPI adjusted variables
local varlist toll1 toll2 wies1 wies2 wealth1 wealth2 TAKST_1 TAKST_2 toll_fam_mean toll_fam_sum toll_fam_max
foreach v in `varlist' {
capt drop `v'
rename `v'_KPI `v'
label var `v' "Inflation adjusted (2017 values)"
}

// Make link between grunnkrets and region type 
sort grkrets
merge m:1 grkrets using  "${datain}Link_grkrets_regtype1" , keep(1 3) nogen

*-------------------------------------------------------
compress
save    "${dataout}MainDataset" , replace
*-------------------------------------------------------

